#include <stdio.h>
#include <stdlib.h>
#include "time.h"
/*
 * CPU reference implementation of element-wise vector addition.
 *
 * Writes a[k] + b[k] into c[k] for every k in [0, L), walking the arrays
 * in ascending index order. Nothing is written when L <= 0.
 */
void addVecteur(int *a,int *b,int *c,int L){
    int k = 0;
    while (k < L) {
        const int sum = a[k] + b[k];
        c[k] = sum;
        ++k;
    }
}
/*
 * GPU element-wise vector addition: c[idx] = a[idx] + b[idx].
 *
 * Launch layout: 1D grid of 1D blocks. Each thread processes the single
 * element at its flat global index (threadIdx.x + blockDim.x * blockIdx.x).
 * Threads whose index is >= L do nothing, so the launch may contain more
 * threads than there are elements; it must contain at least L threads for
 * every element to be written.
 *
 * a, b, c: device-accessible arrays holding at least L ints each.
 * L:       number of elements to add.
 */
__global__ void addVecteurkernel(int *a,int *b,int *c,int L){
    int idx=threadIdx.x+blockDim.x*blockIdx.x;
    /* Guard the grid tail: without it, surplus threads read and write past
       the end of the arrays whenever the grid does not match L exactly. */
    if(idx<L){
        c[idx]=a[idx]+b[idx];
    }
}
/* Print a diagnostic and terminate the process if a CUDA runtime call failed. */
static void checkCuda(cudaError_t status,const char *what){
    if(status!=cudaSuccess){
        fprintf(stderr,"CUDA error (%s): %s\n",what,cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Benchmark driver: adds two integer vectors of BLOC*NTHREAD elements on the
 * CPU (addVecteur) and on the GPU (addVecteurkernel), prints both timings in
 * milliseconds and checks that the GPU result matches the CPU result.
 *
 * Returns EXIT_SUCCESS when the results match, EXIT_FAILURE on a mismatch or
 * on a host allocation failure. Any failing CUDA runtime call terminates the
 * process with EXIT_FAILURE via checkCuda.
 */
int main(void){
    clock_t startCPU, endCPU;
    double cpu_time_used;
    int i,*a,*b,*c,*result;
    int *aCuda,*bCuda,*cCuda;
    int BLOC=16;
    int NTHREAD=1024;
    int L=BLOC*NTHREAD;
    size_t bytes=(size_t)L*sizeof(int);
    int status=EXIT_SUCCESS;
    float TimerV;
    cudaEvent_t start,stop;

    a=(int*)calloc(L,sizeof(int));
    b=(int*)calloc(L,sizeof(int));
    c=(int*)calloc(L,sizeof(int));
    result=(int*)calloc(L,sizeof(int));
    if(a==NULL||b==NULL||c==NULL||result==NULL){
        fprintf(stderr,"host allocation failed\n");
        free(a);free(b);free(c);free(result);
        return EXIT_FAILURE;
    }

    checkCuda(cudaMalloc(&aCuda,bytes),"cudaMalloc aCuda");
    checkCuda(cudaMalloc(&bCuda,bytes),"cudaMalloc bCuda");
    checkCuda(cudaMalloc(&cCuda,bytes),"cudaMalloc cCuda");
    checkCuda(cudaEventCreate(&start),"cudaEventCreate start");
    checkCuda(cudaEventCreate(&stop),"cudaEventCreate stop");

    for(i=0;i<L;i++){
        a[i]=i;
        b[i]=9*i;
    }

    /* CPU reference run. clock() counts ticks, so convert through
       CLOCKS_PER_SEC to really report milliseconds. */
    startCPU = clock();
    addVecteur(a,b,c,L);
    endCPU = clock();
    cpu_time_used = 1000.0 * (double)(endCPU - startCPU) / CLOCKS_PER_SEC;
    printf("temps CPU en msec %f\n",cpu_time_used);

    checkCuda(cudaMemcpy(aCuda,a,bytes,cudaMemcpyHostToDevice),"cudaMemcpy a");
    checkCuda(cudaMemcpy(bCuda,b,bytes,cudaMemcpyHostToDevice),"cudaMemcpy b");

    /* GPU run: only the kernel sits between the two events, so the
       transfers are excluded from the reported GPU time. */
    checkCuda(cudaEventRecord(start,0),"cudaEventRecord start");
    addVecteurkernel<<<BLOC,NTHREAD>>>(aCuda,bCuda,cCuda,L);
    /* A launch does not return a status; configuration errors are only
       visible through cudaGetLastError(). */
    checkCuda(cudaGetLastError(),"addVecteurkernel launch");
    checkCuda(cudaEventRecord(stop,0),"cudaEventRecord stop");
    checkCuda(cudaEventSynchronize(stop),"cudaEventSynchronize");
    checkCuda(cudaEventElapsedTime(&TimerV, start, stop),"cudaEventElapsedTime");

    checkCuda(cudaMemcpy(result,cCuda,bytes,cudaMemcpyDeviceToHost),"cudaMemcpy result");

    /* Compare against the CPU reference; stop at the first mismatch. */
    for(i=0;i<L;i++){
        if(result[i]!=c[i]){
            printf("error\n");
            status=EXIT_FAILURE;
            break;
        }
    }
    if(status==EXIT_SUCCESS){
        printf("temps GPU en ms %f\n",TimerV);
    }

    /* Release everything, including the result buffer and the events. */
    free(a);free(b);free(c);free(result);
    cudaEventDestroy(start);cudaEventDestroy(stop);
    cudaFree(aCuda);cudaFree(bCuda);cudaFree(cCuda);
    return status;
}